Python Web Crawler Basics



Introduction to Crawlers

  • A crawler is a program or script that automatically fetches information from the World Wide Web according to a set of rules (definition from Baidu Baike).

The Purpose of Crawling

  1. Gather data for market research and business analysis.
  2. Provide raw data for machine learning and data mining.
  3. Collect high-quality resources.

How a Crawler Works

  • Most crawlers follow the flow "send a request → receive the page → parse the page → extract and store the content", which essentially mimics how we fetch web pages with a browser; a minimal sketch of this flow follows.
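  • For illustration, here is a minimal sketch of that four-step flow using requests and bs4 (both introduced in the next section); the URL https://example.com and the output file links.csv are placeholders, not part of the original example:

    import csv

    import requests
    from bs4 import BeautifulSoup

    resp = requests.get("https://example.com", timeout=10)  # 1. send the request
    soup = BeautifulSoup(resp.text, "html.parser")          # 2./3. receive and parse the page
    rows = [(a.get_text(strip=True), a.get("href"))         # 4a. extract the content...
            for a in soup.find_all("a")]
    with open("links.csv", "w", encoding="utf-8", newline="") as f:
        csv.writer(f).writerows(rows)                       # 4b. ...and store it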

Relevant Python Packages

  • Python offers many packages for crawling, such as urllib, requests, bs4, scrapy, and pyspider; a quick taste of the first two follows.
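  • These packages differ mostly in ergonomics; for example, fetching the same page with the standard library's urllib versus the third-party requests (a minimal sketch, with https://example.com again a placeholder):

    from urllib.request import urlopen

    import requests

    # Standard library: no extra dependency, but more verbose
    html_stdlib = urlopen("https://example.com").read().decode("utf-8")

    # Third-party requests: simpler API; used in the example below
    html_requests = requests.get("https://example.com").text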

A Simple Crawler Example

  • Below is a simple Python crawler that scrapes Unity3D job postings in Chengdu from 51job (前程无忧):
    import csv
    import os
    from collections import Counter
    from pprint import pprint

    import jieba
    import matplotlib.pyplot as plt
    import requests
    from bs4 import BeautifulSoup
    from wordcloud import WordCloud


    class JobSpider:
        """ Scrape Unity3D postings in Chengdu from 51job and analyze them. """

        def __init__(self):
            self.company = []
            self.text = ""
            self.headers = {
                'X-Requested-With': 'XMLHttpRequest',
                'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                               'AppleWebKit/537.36 (KHTML, like Gecko) '
                               'Chrome/56.0.2924.87 Safari/537.36')
            }

        def job_spider(self):
            """ Crawler entry point. """
            # The listing pages to scrape; {} is the page number
            url = ("https://search.51job.com/list/090200,000000,0000,00,9,99,"
                   "Unity3d,2,{}.html?lang=c&stype=&postchannel=0000"
                   "&workyear=99&cotype=99&degreefrom=99&jobterm=99"
                   "&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1"
                   "&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address="
                   "&line=&specialarea=00&from=&welfare=")
            urls = [url.format(p) for p in range(1, 100)]
            for url in urls:
                # Request the listing page
                r = requests.get(url, headers=self.headers).content
                # Parse the result rows (requires the lxml parser)
                bs = BeautifulSoup(r, 'lxml').find(
                    "div", class_="dw_table").find_all("div", class_="el")
                for b in bs:
                    try:
                        href, post = b.find('a')['href'], b.find('a')['title']
                        locate = b.find('span', class_='t3').text
                        salary = b.find('span', class_='t4').text
                        self.company.append({
                            'href': href,
                            'post': post,
                            'locate': locate,
                            'salary': salary,
                        })
                    except Exception:
                        # Header rows and ads lack these tags; skip them
                        pass

        def post_require(self):
            """ Scrape the job descriptions. """
            for c in self.company:
                # Detail pages are GBK-encoded
                r = requests.get(
                    c.get('href'), headers=self.headers).content.decode('gbk')
                bs = BeautifulSoup(r, 'lxml').find(
                    'div', class_="bmsg job_msg inbox").text
                # Drop the page's "report" (举报) / "share" (分享) widget text
                s = bs.replace("举报", "").replace("分享", "").replace("\t", "").strip()
                self.text += s
            print(self.text)
            with open(os.path.join("data", "post_require.txt"),
                      "w+", encoding="utf-8") as f:
                f.write(self.text)

        @staticmethod
        def post_desc_counter():
            """ Word frequencies over the job descriptions. """
            post = open(os.path.join("data", "post_require.txt"),
                        "r", encoding="utf-8").read()

            # Segment the text with jieba, using a custom user dictionary
            file_path = os.path.join("data", "user_dict.txt")
            jieba.load_userdict(file_path)
            seg_list = jieba.cut(post, cut_all=False)
            counter = dict()
            for seg in seg_list:
                counter[seg] = counter.get(seg, 0) + 1
            counter_sort = sorted(
                counter.items(), key=lambda value: value[1], reverse=True)
            pprint(counter_sort)
            with open(os.path.join("data", "post_pre_desc_counter.csv"),
                      "w+", encoding="utf-8", newline="") as f:
                f_csv = csv.writer(f)
                f_csv.writerows(counter_sort)

        def post_counter(self):
            """ Job-title frequencies. """
            lst = [c.get('post') for c in self.company]
            counter = Counter(lst)
            counter_most = counter.most_common()
            pprint(counter_most)
            with open(os.path.join("data", "post_pre_counter.csv"),
                      "w+", encoding="utf-8", newline="") as f:
                f_csv = csv.writer(f)
                f_csv.writerows(counter_most)

        def post_salary_locate(self):
            """ Posting overview: salary, title, link and location. """
            lst = [(c.get('salary'), c.get('post'), c.get('href'),
                    c.get('locate')) for c in self.company]
            pprint(lst)
            file_path = os.path.join("data", "post_salary_locate.csv")
            with open(file_path, "w+", encoding="utf-8", newline="") as f:
                f_csv = csv.writer(f)
                f_csv.writerows(lst)

        @staticmethod
        def post_salary():
            """ Normalize salaries to a single unit (万/month). """
            month = []
            year = []
            thousand = []
            with open(os.path.join("data", "post_salary_locate.csv"),
                      "r", encoding="utf-8") as f:
                f_csv = csv.reader(f)
                for row in f_csv:
                    # Strip the unit suffix, keep (range, link, title);
                    # 万/月 = 10k CNY per month, 万/年 = 10k CNY per year,
                    # 千/月 = 1k CNY per month
                    if "万/月" in row[0]:
                        month.append((row[0][:-3], row[2], row[1]))
                    elif "万/年" in row[0]:
                        year.append((row[0][:-3], row[2], row[1]))
                    elif "千/月" in row[0]:
                        thousand.append((row[0][:-3], row[2], row[1]))

            # Estimate each "min-max" range as min + 0.4 * (max - min),
            # converted to 万/month
            calc = []
            for m in month:
                s = m[0].split("-")
                calc.append(
                    (round((float(s[1]) - float(s[0])) * 0.4 + float(s[0]), 1),
                     m[1], m[2]))
            for y in year:
                s = y[0].split("-")
                calc.append(
                    (round(((float(s[1]) - float(s[0])) * 0.4
                            + float(s[0])) / 12, 1),
                     y[1], y[2]))
            for t in thousand:
                s = t[0].split("-")
                calc.append(
                    (round(((float(s[1]) - float(s[0])) * 0.4
                            + float(s[0])) / 10, 1),
                     t[1], t[2]))
            pprint(calc)
            with open(os.path.join("data", "post_salary.csv"),
                      "w+", encoding="utf-8", newline="") as f:
                f_csv = csv.writer(f)
                f_csv.writerows(calc)

        @staticmethod
        def post_salary_counter():
            """ Salary frequencies. """
            with open(os.path.join("data", "post_salary.csv"),
                      "r", encoding="utf-8") as f:
                f_csv = csv.reader(f)
                lst = [row[0] for row in f_csv]
            counter = Counter(lst).most_common()
            pprint(counter)
            with open(os.path.join("data", "post_salary_counter1.csv"),
                      "w+", encoding="utf-8", newline="") as f:
                f_csv = csv.writer(f)
                f_csv.writerows(counter)

        @staticmethod
        def word_cloud():
            """ Generate a word cloud from the description word counts. """
            counter = {}
            with open(os.path.join("data", "post_pre_desc_counter.csv"),
                      "r", encoding="utf-8") as f:
                f_csv = csv.reader(f)
                for row in f_csv:
                    counter[row[0]] = int(row[1])
            pprint(counter)
            # A CJK-capable font is required to render the Chinese terms
            file_path = os.path.join("font", "msyh.ttf")
            wc = WordCloud(font_path=file_path,
                           max_words=100,
                           height=600,
                           width=1200)
            wc.generate_from_frequencies(counter)
            plt.imshow(wc)
            plt.axis('off')
            plt.show()
            wc.to_file(os.path.join("images", "wc.jpg"))


    if __name__ == "__main__":
        # Ensure the output directories exist
        os.makedirs("data", exist_ok=True)
        os.makedirs("images", exist_ok=True)

        spider = JobSpider()
        spider.job_spider()
        spider.post_salary_locate()
        spider.post_salary()
        spider.post_salary_counter()
        spider.post_counter()
        # The word cloud needs the description dump and its word counts
        spider.post_require()
        spider.post_desc_counter()
        spider.word_cloud()